Preparation¶
InĀ [1]:
# Data handling and plotting libraries used throughout the notebook.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Add the parent directory to the module search path before importing `helper`
# (presumably helper.py lives one level up from this notebook — confirm).
import sys
sys.path.append('..')
from helper import get_latest_table
InĀ [2]:
# Take a single snapshot of "now" so the month and the year always describe the
# same instant; two separate now() calls can straddle a month/year rollover.
run_timestamp = pd.Timestamp.now()
current_month = run_timestamp.month
current_year = run_timestamp.year

# Load each table through the project helper `get_latest_table`.
cpu_data = get_latest_table('cpu_specs')
gpu_data = get_latest_table('gpu_specs')
full_relation = get_latest_table('full_relation')
Connection to PostgreSQL DB successful Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful Connection to PostgreSQL DB successful
Connection to PostgreSQL DB successful Connection to PostgreSQL DB successful
Preview the data
InĀ [3]:
# Report the dimensions of every loaded table, one line per table.
loaded_tables = (
    ("CPU Data", cpu_data),
    ("GPU Data", gpu_data),
    ("Full Relation Data", full_relation),
)
for table_label, table in loaded_tables:
    row_count, column_count = table.shape
    print(f"{table_label}: {row_count} rows, {column_count} columns")
CPU Data: 2348 rows, 28 columns GPU Data: 618 rows, 13 columns Full Relation Data: 1959 rows, 70 columns
Data Analysis¶
CPU Dataframe¶
Preview the data¶
Dataframe head¶
InĀ [4]:
# Peek at the opening rows of the CPU table (head() defaults to five rows)
cpu_head = cpu_data.head()
print(cpu_head)
name performance_clockspeed performance_turbospeed \ 0 intel core i3 1315ue 1.20 4.50 1 intel core i3 n300 None 3.80 2 intel core i3 1305u 1.60 4.50 3 amd ryzen 3 7320u 2.40 4.10 4 intel core i5 1038ng7 2.00 3.80 performance_cores performance_threads efficient_clockspeed \ 0 2.0 4.0 None 1 8.0 8.0 None 2 1.0 2.0 1.20 3 4.0 8.0 None 4 4.0 8.0 None efficient_turbospeed efficient_cores efficient_threads tdp ... \ 0 3.30 4.0 4.0 15.00 ... 1 None NaN NaN 7.00 ... 2 3.30 4.0 4.0 15.00 ... 3 None NaN NaN 15.00 ... 4 None NaN NaN 28.00 ... eff_l2_cache integer_math floating_point_math find_prime_numbers \ 0 None 34537.0 20958.0 51.0 1 None 29169.0 19343.0 22.0 2 1 x 2048 kb 27950.0 20052.0 36.0 3 None 29638.0 14121.0 20.0 4 None 27545.0 15238.0 28.0 random_string_sorting data_encryption data_compression physics \ 0 10759.0 6321.0 103162.0 824.0 1 12797.0 7034.0 100731.0 516.0 2 10623.0 6021.0 95060.0 518.0 3 13922.0 6266.0 131689.0 437.0 4 11471.0 5714.0 109286.0 698.0 extended_instructions single_thread 0 5172.0 3269 1 5174.0 2122 2 5262.0 3276 3 5905.0 2378 4 6539.0 2152 [5 rows x 28 columns]
Dataframe tail¶
InĀ [5]:
# Peek at the closing rows of the CPU table (tail() defaults to five rows)
cpu_tail = cpu_data.tail()
print(cpu_tail)
name performance_clockspeed \
2343 intel u300e 1.10
2344 arm huawei,kunpeng 920 24 core 2.60
2345 amd custom apu 0932 2.40
2346 intel core i7 10710u 1.10
2347 intel core i3 1125g4 2.00
performance_turbospeed performance_cores performance_threads \
2343 4.30 1.0 2.0
2344 None 24.0 24.0
2345 3.50 4.0 8.0
2346 4.70 6.0 12.0
2347 3.70 4.0 8.0
efficient_clockspeed efficient_turbospeed efficient_cores \
2343 None 3.20 4.0
2344 None None NaN
2345 None None NaN
2346 None None NaN
2347 None None NaN
efficient_threads tdp ... eff_l2_cache integer_math \
2343 4.0 15.00 ... 1 x 2048 kb 30218.0
2344 NaN None ... None 91062.0
2345 NaN 15.00 ... None 28027.0
2346 NaN 15.00 ... None 35167.0
2347 NaN 28.00 ... None 29716.0
floating_point_math find_prime_numbers random_string_sorting \
2343 21589.0 45.0 11513.0
2344 30906.0 48.0 40681.0
2345 17049.0 23.0 14366.0
2346 21715.0 31.0 16853.0
2347 18257.0 34.0 12839.0
data_encryption data_compression physics extended_instructions \
2343 6421.0 98379.0 599.0 5279.0
2344 2447.0 94224.0 822.0 10829.0
2345 7582.0 117043.0 613.0 6566.0
2346 3269.0 128017.0 642.0 8051.0
2347 5666.0 107758.0 577.0 7990.0
single_thread
2343 3546
2344 733
2345 2263
2346 2336
2347 2476
[5 rows x 28 columns]
Check all the features¶
InĀ [6]:
# List every feature (column label) available in the CPU table
cpu_columns = cpu_data.columns
print(cpu_columns)
Index(['name', 'performance_clockspeed', 'performance_turbospeed',
'performance_cores', 'performance_threads', 'efficient_clockspeed',
'efficient_turbospeed', 'efficient_cores', 'efficient_threads', 'tdp',
'multithread_rating', 'single_thread_rating', 'l1_instruction_cache',
'l1_data_cache', 'l2_cache', 'l3_cache', 'eff_l1_instruction_cache',
'eff_l1_data_cache', 'eff_l2_cache', 'integer_math',
'floating_point_math', 'find_prime_numbers', 'random_string_sorting',
'data_encryption', 'data_compression', 'physics',
'extended_instructions', 'single_thread'],
dtype='object')
Check the data types and non-null counts¶
InĀ [7]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the cell output.
cpu_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2348 entries, 0 to 2347 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 2348 non-null object 1 performance_clockspeed 2338 non-null object 2 performance_turbospeed 926 non-null object 3 performance_cores 2259 non-null float64 4 performance_threads 2259 non-null float64 5 efficient_clockspeed 129 non-null object 6 efficient_turbospeed 115 non-null object 7 efficient_cores 163 non-null float64 8 efficient_threads 163 non-null float64 9 tdp 1441 non-null object 10 multithread_rating 2348 non-null int64 11 single_thread_rating 2348 non-null int64 12 l1_instruction_cache 1409 non-null object 13 l1_data_cache 1407 non-null object 14 l2_cache 1405 non-null object 15 l3_cache 868 non-null object 16 eff_l1_instruction_cache 103 non-null object 17 eff_l1_data_cache 103 non-null object 18 eff_l2_cache 93 non-null object 19 integer_math 2149 non-null float64 20 floating_point_math 2149 non-null float64 21 find_prime_numbers 2012 non-null float64 22 random_string_sorting 2149 non-null float64 23 data_encryption 1155 non-null float64 24 data_compression 2149 non-null float64 25 physics 2149 non-null float64 26 extended_instructions 2149 non-null float64 27 single_thread 2348 non-null int64 dtypes: float64(12), int64(3), object(13) memory usage: 513.8+ KB None
Look at descriptive statistics¶
InĀ [8]:
# Summary statistics for the numeric CPU columns
cpu_summary = cpu_data.describe()
print(cpu_summary)
performance_cores performance_threads efficient_cores \
count 2259.000000 2259.000000 163.000000
mean 4.544046 5.947764 6.791411
std 2.801395 4.002537 2.879159
min 1.000000 1.000000 2.000000
25% 2.000000 4.000000 4.000000
50% 4.000000 4.000000 8.000000
75% 8.000000 8.000000 8.000000
max 32.000000 32.000000 16.000000
efficient_threads multithread_rating single_thread_rating \
count 163.000000 2348.000000 2348.000000
mean 6.957055 5056.670358 1393.641823
std 3.081906 7341.559596 1016.341297
min 2.000000 93.000000 95.000000
25% 4.000000 840.500000 568.000000
50% 8.000000 2168.500000 1086.500000
75% 8.000000 5709.250000 1951.250000
max 16.000000 57389.000000 4786.000000
integer_math floating_point_math find_prime_numbers \
count 2149.000000 2149.000000 2012.000000
mean 21716.518381 11977.772918 24.540258
std 25417.697425 18214.180379 48.549368
min 122.000000 166.000000 1.000000
25% 5139.000000 1985.000000 5.000000
50% 13523.000000 4760.000000 10.000000
75% 25358.000000 12910.000000 23.250000
max 209791.000000 131787.000000 619.000000
random_string_sorting data_encryption data_compression physics \
count 2149.000000 1155.000000 2149.000000 2149.000000
mean 9679.062355 6004.123810 73248.829688 367.891112
std 10193.531835 6337.221465 90040.342566 519.354517
min 294.000000 1025.000000 2023.000000 14.000000
25% 2917.000000 1869.000000 18278.000000 93.000000
50% 5869.000000 3258.000000 38372.000000 184.000000
75% 12586.000000 7656.500000 90448.000000 415.000000
max 81685.000000 43769.000000 719086.000000 6476.000000
extended_instructions single_thread
count 2149.000000 2348.000000
mean 3776.795254 1393.641823
std 6023.329546 1016.341297
min 25.000000 95.000000
25% 537.000000 568.000000
50% 1354.000000 1086.500000
75% 3514.000000 1951.250000
max 52490.000000 4786.000000
Feature Analysis¶
Overall Performance Ratings¶
Features:
multithread_rating, single_thread_rating
Distribution of ratings¶
InĀ [9]:
# Apply the notebook-wide seaborn theme
sns.set_theme(style="whitegrid")

# One panel per rating: (column, colour, panel title, x-axis label)
rating_panels = [
    ('single_thread_rating', 'blue', "Single Thread Rating Distribution", 'Single Thread Rating'),
    ('multithread_rating', 'green', "Multithread Rating Distribution", 'Multithread Rating'),
]

# Side-by-side histograms with a KDE overlay
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, (column, colour, panel_title, x_label) in zip(axes, rating_panels):
    sns.histplot(cpu_data[column], ax=ax, color=colour, kde=True)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Tidy spacing, then render
plt.tight_layout()
plt.show()
InĀ [10]:
# Summarise both overall ratings up front ...
single_thread_stats = cpu_data['single_thread_rating'].describe()
multithread_stats = cpu_data['multithread_rating'].describe()

# ... then print each summary under its own heading
print("Single Thread Rating Statistics:", single_thread_stats, sep="\n")
print("\nMultithread Rating Statistics:", multithread_stats, sep="\n")
Single Thread Rating Statistics: count 2348.000000 mean 1393.641823 std 1016.341297 min 95.000000 25% 568.000000 50% 1086.500000 75% 1951.250000 max 4786.000000 Name: single_thread_rating, dtype: float64 Multithread Rating Statistics: count 2348.000000 mean 5056.670358 std 7341.559596 min 93.000000 25% 840.500000 50% 2168.500000 75% 5709.250000 max 57389.000000 Name: multithread_rating, dtype: float64
Single vs Multithreaded¶
InĀ [11]:
# Apply the notebook-wide seaborn theme
sns.set_theme(style="whitegrid")

# Scatter the two ratings against each other on an explicit axes object
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=cpu_data,
    x='single_thread_rating',
    y='multithread_rating',
    alpha=0.7,
    ax=scatter_ax,
)
scatter_ax.set_title("Single Thread Rating vs Multithread Rating", fontsize=16)
scatter_ax.set_xlabel("Single Thread Rating", fontsize=14)
scatter_ax.set_ylabel("Multithread Rating", fontsize=14)
scatter_ax.grid(True)
plt.show()

# Quantify the relationship shown in the scatter plot
correlation = cpu_data['single_thread_rating'].corr(cpu_data['multithread_rating'])
print(f"The correlation between single_thread_rating and multithread_rating is: {correlation:.2f}")
The correlation between single_thread_rating and multithread_rating is: 0.88
Clockspeed metrics¶
Features:
performance_clockspeed, performance_turbospeed, efficient_clockspeed, efficient_turbospeed
Distribution¶
InĀ [12]:
# Apply the notebook-wide seaborn theme
sns.set_theme(style="whitegrid")

# One panel per clock-speed feature: (column, colour, panel title, x-axis label)
clock_panels = [
    ('performance_clockspeed', 'blue', "Performance Cores' Clockspeed Distribution", 'Clockspeed (GHz)'),
    ('performance_turbospeed', 'green', "Performance Cores' Turbospeed Distribution", 'Turbospeed (GHz)'),
    ('efficient_clockspeed', 'red', "Efficient Cores' Clockspeed Distribution", 'Clockspeed (GHz)'),
    ('efficient_turbospeed', 'purple', "Efficient Cores' Turbospeed Distribution", 'Turbospeed (GHz)'),
]

# These columns are loaded with object dtype (see the info() output above), so
# coerce them to floats once, exactly as the later regression cells do. This
# keeps the KDEs and the min/max used for the shared limits purely numeric.
clock_speeds = cpu_data[[panel[0] for panel in clock_panels]].apply(pd.to_numeric, errors='coerce')

# 2x2 grid of filled KDE curves, filled in row-major order
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (column, colour, panel_title, x_label) in zip(axes.flat, clock_panels):
    sns.kdeplot(clock_speeds[column].dropna(), ax=ax, color=colour, fill=True)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Density')

# Share one x range (overall min/max across the four columns) and one y range
# (tallest density among the panels) so the panels are directly comparable.
x_min = clock_speeds.min().min()
x_max = clock_speeds.max().max()
y_max = max(ax.get_ylim()[1] for ax in axes.flat)
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

# Tidy spacing, then render
plt.tight_layout()
plt.show()
Correlation with Performance¶
InĀ [13]:
# The base clock columns are stored with object dtype, so convert them to floats
# explicitly (as the regression cell below does) instead of relying on
# Series.corr to convert object values implicitly.
base_clock_speeds = cpu_data[['performance_clockspeed', 'efficient_clockspeed']].apply(
    pd.to_numeric, errors='coerce'
)
single_thread_scores = cpu_data['single_thread_rating']
multithread_scores = cpu_data['multithread_rating']

# Pearson correlation of each base clock with each overall rating
correlation_performance_single = base_clock_speeds['performance_clockspeed'].corr(single_thread_scores)
correlation_performance_multi = base_clock_speeds['performance_clockspeed'].corr(multithread_scores)
correlation_efficient_single = base_clock_speeds['efficient_clockspeed'].corr(single_thread_scores)
correlation_efficient_multi = base_clock_speeds['efficient_clockspeed'].corr(multithread_scores)

# Print the results
print(f"Correlation between performance_clockspeed and single_thread_rating: {correlation_performance_single:.2f}")
print(f"Correlation between performance_clockspeed and multithread_rating: {correlation_performance_multi:.2f}")
print(f"Correlation between efficient_clockspeed and single_thread_rating: {correlation_efficient_single:.2f}")
print(f"Correlation between efficient_clockspeed and multithread_rating: {correlation_efficient_multi:.2f}")
Correlation between performance_clockspeed and single_thread_rating: 0.61 Correlation between performance_clockspeed and multithread_rating: 0.48 Correlation between efficient_clockspeed and single_thread_rating: 0.21 Correlation between efficient_clockspeed and multithread_rating: 0.14
InĀ [14]:
# Work on a copy so the loaded table keeps its original dtypes
cpu_data_clone = cpu_data.copy()

# Force every plotted column to a numeric dtype; unparsable values become NaN
for numeric_column in (
    'performance_clockspeed',
    'efficient_clockspeed',
    'single_thread_rating',
    'multithread_rating',
):
    cpu_data_clone[numeric_column] = pd.to_numeric(cpu_data_clone[numeric_column], errors='coerce')

# One regression panel per (clock, rating) pair:
# (x column, y column, colour, panel title, x-axis label, y-axis label)
clock_regression_panels = [
    ('performance_clockspeed', 'single_thread_rating', 'blue',
     'Performance Clockspeed vs Single Thread Rating', 'Performance Clockspeed (GHz)', 'Single Thread Rating'),
    ('performance_clockspeed', 'multithread_rating', 'green',
     'Performance Clockspeed vs Multithread Rating', 'Performance Clockspeed (GHz)', 'Multithread Rating'),
    ('efficient_clockspeed', 'single_thread_rating', 'red',
     'Efficient Clockspeed vs Single Thread Rating', 'Efficient Clockspeed (GHz)', 'Single Thread Rating'),
    ('efficient_clockspeed', 'multithread_rating', 'purple',
     'Efficient Clockspeed vs Multithread Rating', 'Efficient Clockspeed (GHz)', 'Multithread Rating'),
]

# 2x2 grid of scatter plots with fitted regression lines, in row-major order
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (x_column, y_column, colour, panel_title, x_label, y_label) in zip(axes.flat, clock_regression_panels):
    sns.regplot(data=cpu_data_clone, x=x_column, y=y_column, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
Boost impact¶
InĀ [15]:
# Work on a copy so the loaded table keeps its original dtypes
cpu_data_clone = cpu_data.copy()

# Coerce the four clock columns to numbers; unparsable values become NaN
for clock_column in (
    'performance_turbospeed',
    'performance_clockspeed',
    'efficient_turbospeed',
    'efficient_clockspeed',
):
    cpu_data_clone[clock_column] = pd.to_numeric(cpu_data_clone[clock_column], errors='coerce')

# Turbo boost margin = turbo speed minus base clock, per core type
cpu_data_clone['performance_turbo_boost'] = cpu_data_clone['performance_turbospeed'].sub(
    cpu_data_clone['performance_clockspeed']
)
cpu_data_clone['efficient_turbo_boost'] = cpu_data_clone['efficient_turbospeed'].sub(
    cpu_data_clone['efficient_clockspeed']
)

# One regression panel per (boost margin, rating) pair:
# (x column, y column, colour, panel title, x-axis label, y-axis label)
boost_panels = [
    ('performance_turbo_boost', 'single_thread_rating', 'blue',
     'Performance Turbo Boost vs Single Thread Rating', 'Performance Turbo Boost (GHz)', 'Single Thread Rating'),
    ('performance_turbo_boost', 'multithread_rating', 'green',
     'Performance Turbo Boost vs Multithread Rating', 'Performance Turbo Boost (GHz)', 'Multithread Rating'),
    ('efficient_turbo_boost', 'single_thread_rating', 'red',
     'Efficient Turbo Boost vs Single Thread Rating', 'Efficient Turbo Boost (GHz)', 'Single Thread Rating'),
    ('efficient_turbo_boost', 'multithread_rating', 'purple',
     'Efficient Turbo Boost vs Multithread Rating', 'Efficient Turbo Boost (GHz)', 'Multithread Rating'),
]

# 2x2 grid of scatter plots with fitted regression lines, in row-major order
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (x_column, y_column, colour, panel_title, x_label, y_label) in zip(axes.flat, boost_panels):
    sns.regplot(data=cpu_data_clone, x=x_column, y=y_column, ax=ax, color=colour)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
Core & Thread Analysis¶
Features:
performance_cores, performance_threads, efficient_cores, efficient_threads
Distribution¶
InĀ [16]:
# Apply the notebook-wide seaborn theme
sns.set_theme(style="whitegrid")

# One panel per count feature: (column, colour, panel title, x-axis label)
count_panels = [
    ('performance_cores', 'blue', "Performance Cores Distribution", 'Number of Cores'),
    ('performance_threads', 'green', "Performance Threads Distribution", 'Number of Threads'),
    ('efficient_cores', 'red', "Efficient Cores Distribution", 'Number of Cores'),
    ('efficient_threads', 'purple', "Efficient Threads Distribution", 'Number of Threads'),
]

# 2x2 grid of histograms with a KDE overlay, in row-major order
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (column, colour, panel_title, x_label) in zip(axes.flat, count_panels):
    sns.histplot(cpu_data[column].dropna(), ax=ax, color=colour, kde=True)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Shared axis ranges: overall min/max of the four columns on x, and the tallest
# panel on y, so the four histograms can be compared directly
x_min = min(cpu_data[column].min() for column, _colour, _title, _label in count_panels)
x_max = max(cpu_data[column].max() for column, _colour, _title, _label in count_panels)
y_max = max(ax.get_ylim()[1] for ax in axes.flat)
for ax in axes.flat:
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0, y_max)

# Tidy spacing, then render
plt.tight_layout()
plt.show()
InĀ [17]:
# Work on a copy so the derived ratio columns never touch the loaded table
cpu_data_clone = cpu_data.copy()

# Cores divided by threads, per core type (0.5 means two threads per core)
cpu_data_clone['performance_core_thread_ratio'] = cpu_data_clone['performance_cores'].div(
    cpu_data_clone['performance_threads']
)
cpu_data_clone['efficient_core_thread_ratio'] = cpu_data_clone['efficient_cores'].div(
    cpu_data_clone['efficient_threads']
)

# How often each ratio occurs, ordered by ratio value
performance_ratio_counts = cpu_data_clone['performance_core_thread_ratio'].value_counts().sort_index()
efficient_ratio_counts = cpu_data_clone['efficient_core_thread_ratio'].value_counts().sort_index()

# Print each frequency table under its own heading
print("Performance Core/Thread Ratio Frequencies:", performance_ratio_counts, sep="\n")
print("\nEfficient Core/Thread Ratio Frequencies:", efficient_ratio_counts, sep="\n")
Performance Core/Thread Ratio Frequencies: performance_core_thread_ratio 0.5 774 1.0 1485 Name: count, dtype: int64 Efficient Core/Thread Ratio Frequencies: efficient_core_thread_ratio 0.5 4 1.0 159 Name: count, dtype: int64
Multi-threading impact¶
InĀ [18]:
# Every count feature is compared against the same multithread score
multithread_scores = cpu_data['multithread_rating']

# Pearson correlation of each core/thread count with the multithread rating
correlation_performance_cores = cpu_data['performance_cores'].corr(multithread_scores)
correlation_performance_threads = cpu_data['performance_threads'].corr(multithread_scores)
correlation_efficient_cores = cpu_data['efficient_cores'].corr(multithread_scores)
correlation_efficient_threads = cpu_data['efficient_threads'].corr(multithread_scores)

# Report the coefficients, one line per feature
for report_line in (
    f"Correlation between performance_cores and multithread_rating: {correlation_performance_cores:.2f}",
    f"Correlation between performance_threads and multithread_rating: {correlation_performance_threads:.2f}",
    f"Correlation between efficient_cores and multithread_rating: {correlation_efficient_cores:.2f}",
    f"Correlation between efficient_threads and multithread_rating: {correlation_efficient_threads:.2f}",
):
    print(report_line)
Correlation between performance_cores and multithread_rating: 0.41 Correlation between performance_threads and multithread_rating: 0.74 Correlation between efficient_cores and multithread_rating: 0.47 Correlation between efficient_threads and multithread_rating: 0.49
InĀ [19]:
# One regression panel per count feature: (x column, colour, panel title, x-axis label)
count_regression_panels = [
    ('performance_cores', 'blue', 'Performance Cores vs Multithread Rating', 'Performance Cores'),
    ('performance_threads', 'green', 'Performance Threads vs Multithread Rating', 'Performance Threads'),
    ('efficient_cores', 'red', 'Efficient Cores vs Multithread Rating', 'Efficient Cores'),
    ('efficient_threads', 'purple', 'Efficient Threads vs Multithread Rating', 'Efficient Threads'),
]

# 2x2 grid of scatter plots with fitted regression lines, in row-major order;
# every panel plots the same multithread rating on the y axis
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (x_column, colour, panel_title, x_label) in zip(axes.flat, count_regression_panels):
    sns.regplot(data=cpu_data, x=x_column, y='multithread_rating', ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Multithread Rating')

plt.tight_layout()
plt.show()
Power Consumption (TDP)¶
Features:
TDP
TDP vs Performance¶
InĀ [20]:
# Work on a copy and give it a numeric TDP column (unparsable values become NaN)
cpu_data_clone = cpu_data.copy()
cpu_data_clone['tdp'] = pd.to_numeric(cpu_data_clone['tdp'], errors='coerce')

# Pearson correlation of TDP with each overall rating
tdp_values = cpu_data_clone['tdp']
correlation_tdp_single = tdp_values.corr(cpu_data_clone['single_thread_rating'])
correlation_tdp_multi = tdp_values.corr(cpu_data_clone['multithread_rating'])

# Report both coefficients
for report_line in (
    f"Correlation between TDP and single_thread_rating: {correlation_tdp_single:.2f}",
    f"Correlation between TDP and multithread_rating: {correlation_tdp_multi:.2f}",
):
    print(report_line)
Correlation between TDP and single_thread_rating: 0.39 Correlation between TDP and multithread_rating: 0.43
InĀ [21]:
# Build the plotting frame inside this cell instead of reusing whatever
# `cpu_data_clone` currently holds: that name is reassigned by several other
# cells (the efficiency cell below filters and sorts it), so depending on it
# made this figure change with execution order.
tdp_plot_data = cpu_data.copy()
tdp_plot_data['tdp'] = pd.to_numeric(tdp_plot_data['tdp'], errors='coerce')

# One regression panel per rating: (y column, colour, panel title, y-axis label)
tdp_panels = [
    ('single_thread_rating', 'blue', 'TDP vs Single Thread Rating', 'Single Thread Rating'),
    ('multithread_rating', 'green', 'TDP vs Multithread Rating', 'Multithread Rating'),
]

# Side-by-side scatter plots with fitted regression lines
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, (y_column, colour, panel_title, y_label) in zip(axes, tdp_panels):
    sns.regplot(data=tdp_plot_data, x='tdp', y=y_column, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(panel_title)
    ax.set_xlabel('TDP (W)')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()
Efficiency Analysis¶
InĀ [22]:
# Work on a copy and give it a numeric TDP column (unparsable values become NaN)
cpu_data_clone = cpu_data.copy()
cpu_data_clone['tdp'] = pd.to_numeric(cpu_data_clone['tdp'], errors='coerce')

# Efficiency = multithread rating per unit of TDP. Mask non-positive TDP values
# first: dividing by zero would produce +inf, which is neither NaN nor <= 0, so
# it would slip through both filters below, top the ranking and break the
# histogram binning.
positive_tdp = cpu_data_clone['tdp'].where(cpu_data_clone['tdp'] > 0)
cpu_data_clone['performance_efficiency'] = cpu_data_clone['multithread_rating'] / positive_tdp

# Keep only rows with a defined, strictly positive efficiency, best first
cpu_data_clone = cpu_data_clone.dropna(subset=['performance_efficiency'])
cpu_data_clone = cpu_data_clone[cpu_data_clone['performance_efficiency'] > 0]
cpu_data_clone = cpu_data_clone.sort_values(by='performance_efficiency', ascending=False)

# Show the most and least efficient CPUs
efficiency_columns = ['name', 'multithread_rating', 'tdp', 'performance_efficiency']
print("Top 5 rows:")
print(cpu_data_clone[efficiency_columns].head())
print("\nBottom 5 rows:")
print(cpu_data_clone[efficiency_columns].tail())

# Plot the distribution of performance efficiency
plt.figure(figsize=(10, 6))
sns.histplot(cpu_data_clone['performance_efficiency'], kde=True, color="blue", bins=30)
plt.title("Performance Efficiency Distribution", fontsize=16)
plt.xlabel("Performance Efficiency (multithread_rating / tdp)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(True)
plt.show()
Top 5 rows:
name multithread_rating tdp performance_efficiency
117 intel core ultra 7 164u 15187 9.0 1687.444444
470 amd ryzen z1 extreme 25182 15.0 1678.800000
82 apple a18 pro 13063 8.0 1632.875000
91 intel core i7 1260u 14001 9.0 1555.666667
52 intel core i7 1250u 11673 9.0 1297.000000
Bottom 5 rows:
name multithread_rating tdp \
735 mobile amd athlon xp-m 1800+ 193 45.0
980 mobile amd athlon 64 3400+ 333 81.5
974 mobile amd athlon 64 3200+ 326 81.5
671 intel celeron b710 106 35.0
681 mobile intel celeron 1.80ghz 121 66.1
performance_efficiency
735 4.288889
980 4.085890
974 4.000000
671 3.028571
681 1.830560
GPU Dataframe¶
Preview the data¶
Dataframe head¶
InĀ [23]:
# Peek at the opening rows of the GPU table (head() defaults to five rows)
gpu_head = gpu_data.head()
print(gpu_head)
name avg_g3d_mark bus_interface max_memory_size \ 0 radeon rx 6600m 13814 pcie 4.0 x8 8192.0 1 radeont rx 6850m xt 13848 pcie 4.0 x16 12288.0 2 rtx 1000 ada generation 14043 None NaN 3 rtx a3000 12gb 14088 None NaN 4 geforce rtx 4050 14433 pcie 4.0 x16 6144.0 core_clock max_direct open_gl max_tdp test_directx_9 test_directx_10 \ 0 2068.0 12_2 4.6 100.0 180.0 89.0 1 2321.0 12_2 4.6 165.0 144.0 106.0 2 NaN None None NaN 179.0 74.0 3 NaN None None NaN 169.0 88.0 4 1605.0 12_2 4.6 115.0 186.0 81.0 test_directx_11 test_directx_12 test_gpu_compute 0 135.0 52.0 5752.0 1 166.0 59.0 5210.0 2 115.0 65.0 5471.0 3 115.0 65.0 5593.0 4 131.0 61.0 5943.0
Dataframe tail¶
InĀ [24]:
# Peek at the closing rows of the GPU table (tail() defaults to five rows)
gpu_tail = gpu_data.tail()
print(gpu_tail)
name avg_g3d_mark bus_interface max_memory_size \
613 radeon rx 7900m 22752 None NaN
614 rtx 4000 ada generation 22962 None NaN
615 rtx 5000 ada generation 24006 None NaN
616 geforce rtx 4080 25076 pcie 4.0 x16 12288.0
617 geforce rtx 4090 27754 pcie 4.0 x16 16384.0
core_clock max_direct open_gl max_tdp test_directx_9 test_directx_10 \
613 NaN None None NaN 267.0 127.0
614 NaN None None NaN 271.0 140.0
615 NaN None None NaN 272.0 153.0
616 1860.0 12_2 4.6 150.0 286.0 161.0
617 1455.0 12_2 4.6 150.0 315.0 181.0
test_directx_11 test_directx_12 test_gpu_compute
613 256.0 93.0 9297.0
614 224.0 100.0 9232.0
615 239.0 102.0 9553.0
616 248.0 96.0 11422.0
617 270.0 107.0 12650.0
Check all the features¶
InĀ [25]:
# List every feature (column label) available in the GPU table
gpu_columns = gpu_data.columns
print(gpu_columns)
Index(['name', 'avg_g3d_mark', 'bus_interface', 'max_memory_size',
'core_clock', 'max_direct', 'open_gl', 'max_tdp', 'test_directx_9',
'test_directx_10', 'test_directx_11', 'test_directx_12',
'test_gpu_compute'],
dtype='object')
Check the data types and non-null counts¶
InĀ [26]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the cell output.
gpu_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 618 entries, 0 to 617 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 618 non-null object 1 avg_g3d_mark 618 non-null int64 2 bus_interface 349 non-null object 3 max_memory_size 342 non-null float64 4 core_clock 309 non-null float64 5 max_direct 353 non-null object 6 open_gl 346 non-null object 7 max_tdp 245 non-null float64 8 test_directx_9 340 non-null float64 9 test_directx_10 340 non-null float64 10 test_directx_11 340 non-null float64 11 test_directx_12 340 non-null float64 12 test_gpu_compute 340 non-null float64 dtypes: float64(8), int64(1), object(4) memory usage: 62.9+ KB None
Look at descriptive statistics¶
InĀ [27]:
# Summary statistics for the numeric GPU columns
gpu_summary = gpu_data.describe()
print(gpu_summary)
avg_g3d_mark max_memory_size core_clock max_tdp test_directx_9 \
count 618.000000 342.000000 309.000000 245.000000 340.000000
mean 2784.377023 2852.590643 756.132686 58.142857 64.752941
std 4605.472224 3298.820120 366.364012 38.361524 67.330367
min 2.000000 2.000000 143.000000 7.000000 1.000000
25% 358.000000 512.000000 500.000000 25.000000 11.000000
50% 671.500000 2048.000000 660.000000 50.000000 36.000000
75% 2697.000000 4096.000000 954.000000 80.000000 107.250000
max 27754.000000 16384.000000 2321.000000 165.000000 315.000000
test_directx_10 test_directx_11 test_directx_12 test_gpu_compute
count 340.000000 340.000000 340.000000 340.000000
mean 26.597059 38.252941 19.311765 1892.626471
std 37.149137 51.488414 24.423437 2288.641490
min 0.000000 0.000000 0.000000 0.000000
25% 2.000000 4.000000 0.000000 239.500000
50% 7.000000 15.000000 7.500000 806.000000
75% 35.000000 54.000000 31.000000 2865.000000
max 181.000000 270.000000 107.000000 12650.000000
Feature Analysis¶
Clock Speed Analysis¶
Features:
core_clock
Distribution¶
InĀ [28]:
# Histogram (30 bins, KDE overlay) of the known GPU core clocks on explicit axes
clock_fig, clock_ax = plt.subplots(figsize=(10, 6))
known_core_clocks = gpu_data['core_clock'].dropna()
sns.histplot(known_core_clocks, kde=True, color='blue', bins=30, ax=clock_ax)

# Title and axis labels
clock_ax.set_title("Distribution of GPU Core Clock Speeds", fontsize=16)
clock_ax.set_xlabel("Core Clock (MHz)", fontsize=14)
clock_ax.set_ylabel("Frequency", fontsize=14)

# Render the figure
plt.show()
Impact on Performance¶
InĀ [29]:
# Pearson correlation of the core clock with every benchmark score
core_clock = gpu_data['core_clock']
correlation_core_clock_avg_g3d_mark = core_clock.corr(gpu_data['avg_g3d_mark'])
correlation_core_clock_test_directx_9 = core_clock.corr(gpu_data['test_directx_9'])
correlation_core_clock_test_directx_10 = core_clock.corr(gpu_data['test_directx_10'])
correlation_core_clock_test_directx_11 = core_clock.corr(gpu_data['test_directx_11'])
correlation_core_clock_test_directx_12 = core_clock.corr(gpu_data['test_directx_12'])
correlation_core_clock_test_gpu_compute = core_clock.corr(gpu_data['test_gpu_compute'])

# One entry per benchmark: (column, display label, colour, coefficient)
benchmark_panels = [
    ('avg_g3d_mark', 'Avg G3D Mark', 'blue', correlation_core_clock_avg_g3d_mark),
    ('test_directx_9', 'Test DirectX 9', 'green', correlation_core_clock_test_directx_9),
    ('test_directx_10', 'Test DirectX 10', 'red', correlation_core_clock_test_directx_10),
    ('test_directx_11', 'Test DirectX 11', 'purple', correlation_core_clock_test_directx_11),
    ('test_directx_12', 'Test DirectX 12', 'orange', correlation_core_clock_test_directx_12),
    ('test_gpu_compute', 'Test GPU Compute', 'brown', correlation_core_clock_test_gpu_compute),
]

# Report the coefficients, one line per benchmark
for benchmark, display_label, colour, coefficient in benchmark_panels:
    print(f"Correlation between core_clock and {benchmark}: {coefficient:.2f}")

# Apply the notebook-wide seaborn theme
sns.set_theme(style="whitegrid")

# 3x2 grid of scatter plots with fitted regression lines, in row-major order;
# each title repeats the coefficient for that benchmark
fig, axes = plt.subplots(3, 2, figsize=(14, 18))
for ax, (benchmark, display_label, colour, coefficient) in zip(axes.flat, benchmark_panels):
    sns.regplot(data=gpu_data, x='core_clock', y=benchmark, ax=ax, color=colour, scatter_kws={'s': 10})
    ax.set_title(f"Core Clock vs {display_label} (Correlation: {coefficient:.2f})")

# Tidy spacing, then render
plt.tight_layout()
plt.show()
Correlation between core_clock and avg_g3d_mark: 0.71 Correlation between core_clock and test_directx_9: 0.70 Correlation between core_clock and test_directx_10: 0.63 Correlation between core_clock and test_directx_11: 0.68 Correlation between core_clock and test_directx_12: 0.70 Correlation between core_clock and test_gpu_compute: 0.68
Memory and Bandwidth Analysis¶
Features:
max_memory_size, bus_interface
Memory Size¶
InĀ [30]:
# Independent (deep) copy, so columns added below never reach the loaded table
gpu_data_clone = gpu_data.copy(deep=True)
# Bucket a memory size into a labelled range; missing sizes get their own label
def categorize_memory_size(memory_size):
    """Return the memory-size category label for a single value.

    The thresholds 2048 / 4096 / 8192 / 16384 pair with the 2 / 4 / 8 / 16 GB
    labels, so the value is treated as megabytes. Each upper bound is
    inclusive: exactly 2048 still falls in '<2GB'.

    Parameters
    ----------
    memory_size : number or missing value
        Size to classify; anything `pd.isna` reports as missing is accepted.

    Returns
    -------
    str
        'Unknown' for missing input, '>16GB' above 16384, otherwise the label
        of the first range whose upper bound is not exceeded.
    """
    if pd.isna(memory_size):
        return 'Unknown'

    # NOTE(review): the 'ā' in the range labels looks like a mis-encoded dash;
    # it is kept verbatim because later code matches on these exact strings.
    inclusive_upper_bounds = (
        (2048, '<2GB'),
        (4096, '2ā4GB'),
        (8192, '4ā8GB'),
        (16384, '8ā16GB'),
    )
    # Bounds are tried in ascending order, so the first match is the tightest range
    for upper_bound, label in inclusive_upper_bounds:
        if memory_size <= upper_bound:
            return label
    return '>16GB'
# Apply the categorization function to the 'max_memory_size' column
gpu_data_clone['memory_size_category'] = gpu_data_clone['max_memory_size'].apply(categorize_memory_size)
# Group by the memory size category and calculate the average avg_g3d_mark
memory_size_comparison = gpu_data_clone.groupby('memory_size_category')['avg_g3d_mark'].mean()
# Exclude the 'Unknown' category from the comparison
memory_size_comparison = memory_size_comparison[memory_size_comparison.index != 'Unknown']
# Check the unique categories in the memory_size_comparison DataFrame
print("Unique categories in memory_size_comparison:", memory_size_comparison.index)
# Define the custom order of memory size categories
category_order = ['<2GB', '2ā4GB', '4ā8GB', '8ā16GB', '>16GB']
# Ensure that the order only includes categories that are present in the data
category_order = [category for category in category_order if category in memory_size_comparison.index]
# Sort the memory_size_comparison based on the custom order
memory_size_comparison = memory_size_comparison[category_order]
# Print the results
print(memory_size_comparison)
# Plot the comparison
plt.figure(figsize=(10, 6))
memory_size_comparison.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title("Average G3D Mark by GPU Memory Size Category", fontsize=16)
plt.xlabel("Memory Size Category", fontsize=14)
plt.ylabel("Average G3D Mark", fontsize=14)
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# Show the plot
plt.show()
Unique categories in memory_size_comparison: Index(['2ā4GB', '4ā8GB', '8ā16GB', '<2GB'], dtype='object', name='memory_size_category') memory_size_category <2GB 579.118182 2ā4GB 3846.544118 4ā8GB 11477.357143 8ā16GB 16148.416667 Name: avg_g3d_mark, dtype: float64
Bus Interface¶
InĀ [31]:
# Mean G3D Mark per bus interface, drawn as a horizontal bar chart.
# Work on a copy so the shared gpu_data frame is not modified.
gpu_data_clone = gpu_data.copy()

# Only rows that have both the grouping key and the score can contribute
filtered_gpu_data_clone = gpu_data_clone.dropna(subset=['bus_interface', 'avg_g3d_mark'])

# Ascending sort so the bars run from the lowest to the highest mean score
bus_interface_performance = (
    filtered_gpu_data_clone
    .groupby('bus_interface')['avg_g3d_mark']
    .mean()
    .sort_values()
)

# Plot the results
plt.figure(figsize=(12, 6))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as y and suppress the (redundant) legend, as the FutureWarning recommends.
sns.barplot(
    y=bus_interface_performance.index,
    x=bus_interface_performance.values,
    hue=bus_interface_performance.index,
    palette="viridis",
    legend=False,
    orient='h',
)
plt.title("Impact of Bus Interface on GPU Performance (avg_g3d_mark)", fontsize=16)
plt.xlabel("Average G3D Mark", fontsize=14)
plt.ylabel("Bus Interface", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
/tmp/ipykernel_670/3052634162.py:12: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(y=bus_interface_performance.index, x=bus_interface_performance.values, palette="viridis", orient='h')
Power Consumption (TDP)¶
Features:
max_tdp
Performance vs Power¶
InĀ [32]:
# Relationship between power draw (max_tdp) and avg_g3d_mark.
# Operate on a private copy of gpu_data.
gpu_data_clone = gpu_data.copy()

# Coerce max_tdp to numbers; anything unparsable becomes NaN
tdp_numeric = pd.to_numeric(gpu_data_clone['max_tdp'], errors='coerce')
gpu_data_clone['max_tdp'] = tdp_numeric

# Pearson correlation between TDP and the score
correlation_tdp_g3d = gpu_data_clone['max_tdp'].corr(gpu_data_clone['avg_g3d_mark'])
print(f"Correlation between max_tdp and avg_g3d_mark: {correlation_tdp_g3d:.2f}")

# Scatter plot with a fitted regression line
plt.figure(figsize=(10, 6))
sns.regplot(
    x='max_tdp',
    y='avg_g3d_mark',
    data=gpu_data_clone,
    color='blue',
    scatter_kws={'s': 10},
)

# Titles, axis labels and grid
plt.title("Max TDP vs Avg G3D Mark", fontsize=16)
plt.xlabel("Max TDP (W)", fontsize=14)
plt.ylabel("Avg G3D Mark", fontsize=14)
plt.grid(True)

plt.show()
Correlation between max_tdp and avg_g3d_mark: 0.75
Efficiency¶
InĀ [33]:
# Performance per watt: avg_g3d_mark divided by max_tdp.
# Work on a copy so the shared gpu_data frame is not modified.
gpu_data_clone = gpu_data.copy()

# Coerce both inputs to numbers; anything unparsable becomes NaN
for column in ('avg_g3d_mark', 'max_tdp'):
    gpu_data_clone[column] = pd.to_numeric(gpu_data_clone[column], errors='coerce')

# Keep only rows where the ratio is meaningful:
#  - a missing score would give a NaN efficiency, which sort_values places last
#    and would therefore show up in the "Bottom 5" listing;
#  - a zero TDP would give an infinite efficiency and top the "Top 5" listing.
has_valid_inputs = gpu_data_clone['avg_g3d_mark'].notna() & gpu_data_clone['max_tdp'].gt(0)
gpu_data_clone = gpu_data_clone.loc[has_valid_inputs].copy()

# Compute performance efficiency (score per unit of TDP)
gpu_data_clone['efficiency'] = gpu_data_clone['avg_g3d_mark'] / gpu_data_clone['max_tdp']

# Most efficient first
gpu_data_sorted = gpu_data_clone.sort_values(by='efficiency', ascending=False)

# Columns shown in both listings
efficiency_columns = ['name', 'avg_g3d_mark', 'max_tdp', 'efficiency']

print("Top 5 GPUs by Efficiency:")
print(gpu_data_sorted[efficiency_columns].head())

print("\nBottom 5 GPUs by Efficiency:")
print(gpu_data_sorted[efficiency_columns].tail())
Top 5 GPUs by Efficiency:
name avg_g3d_mark max_tdp efficiency
510 radeon pro w6300 5560 25.0 222.400000
591 radeon rx 7600s 14732 75.0 196.426667
593 radeon rx 6700s 14974 80.0 187.175000
617 geforce rtx 4090 27754 150.0 185.026667
556 radeon pro 5600m 9233 50.0 184.660000
Bottom 5 GPUs by Efficiency:
name avg_g3d_mark max_tdp efficiency
103 radeon hd 6320 147 45.0 3.266667
121 geforce go 7800 gtx 210 65.0 3.230769
84 radeon hd 6310 122 45.0 2.711111
63 radeon hd 6250 94 35.0 2.685714
70 radeon hd 6290 105 45.0 2.333333
Overall Performance Ratings¶
Features:
avg_g3d_mark (G3D Mark score), test_gpu_compute (compute performance)
Distribution of ratings¶
InĀ [34]:
# Histogram (with KDE overlay) for each overall rating column.
# Each entry: (column in gpu_data, bar colour, human-readable label).
rating_histograms = (
    ('avg_g3d_mark', 'blue', 'Avg G3D Mark'),
    ('test_gpu_compute', 'green', 'Test GPU Compute'),
)

for rating_column, bar_colour, rating_label in rating_histograms:
    # One separate figure per rating; missing values are dropped before plotting
    plt.figure(figsize=(12, 6))
    sns.histplot(gpu_data[rating_column].dropna(), kde=True, color=bar_colour, bins=30)
    plt.title(f"Distribution of {rating_label}", fontsize=16)
    plt.xlabel(rating_label, fontsize=14)
    plt.ylabel("Frequency", fontsize=14)
    plt.grid(True)
    plt.show()
Compute vs Gaming¶
InĀ [35]:
# Gaming score (avg_g3d_mark) against compute score (test_gpu_compute).
# Work on a copy so the shared gpu_data frame is not modified.
gpu_data_clone = gpu_data.copy()

# Performance tiers based on avg_g3d_mark.
# The last bin is open-ended: scores in this dataset go well above 10000, and a
# closed top edge would leave those GPUs without a category (NaN hue).
# include_lowest=True keeps a score of exactly 0 in the first tier.
bins = [0, 2000, 4000, 6000, 8000, float('inf')]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
gpu_data_clone['performance_category'] = pd.cut(
    gpu_data_clone['avg_g3d_mark'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Pearson correlation between the two scores
correlation_gaming_compute = gpu_data_clone['avg_g3d_mark'].corr(gpu_data_clone['test_gpu_compute'])

# Scatter plot coloured by performance tier
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=gpu_data_clone,
    x='avg_g3d_mark',
    y='test_gpu_compute',
    hue='performance_category',
    alpha=0.7,
)

# Add titles and labels
plt.title(f"Avg G3D Mark vs Test GPU Compute (Correlation: {correlation_gaming_compute:.2f})", fontsize=16)
plt.xlabel("Avg G3D Mark (Gaming Performance)", fontsize=14)
plt.ylabel("Test GPU Compute (Compute Performance)", fontsize=14)
plt.grid(True)

plt.show()

# Print correlation
print(f"The correlation between avg_g3d_mark and test_gpu_compute is: {correlation_gaming_compute:.2f}")
The correlation between avg_g3d_mark and test_gpu_compute is: 0.99
Full Laptop Dataframe¶
Source (Laptop Shop)¶
Analyzing number of laptops from each source¶
InĀ [36]:
# Number of laptop listings contributed by each shop (source)
source_counts = full_relation['laptop_specs_source'].value_counts()

# Horizontal bar chart of the counts
plt.figure(figsize=(10, 6))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as y and suppress the (redundant) legend, as the FutureWarning recommends.
ax = sns.barplot(
    y=source_counts.index,
    x=source_counts.values,
    hue=source_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Number of laptops per shop", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Source", fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels; iterate over every container so all bars get a label
# regardless of how seaborn groups them
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

plt.show()
/tmp/ipykernel_670/3058434414.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(y=source_counts.index, x=source_counts.values, palette="viridis")
Analysing price grouped by source¶
InĀ [37]:
# Box plot of laptop price for each shop (source)
sns.set_theme(style="whitegrid")

plt.figure(figsize=(14, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as y and suppress the (redundant) legend, as the FutureWarning recommends.
sns.boxplot(
    data=full_relation,
    y='laptop_specs_source',
    x='laptop_specs_price',
    hue='laptop_specs_source',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Source", fontsize=16)
plt.ylabel("Source", fontsize=14)
plt.xlabel("Price", fontsize=14)
plt.grid(True)

plt.tight_layout()
plt.show()
/tmp/ipykernel_670/3580318046.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=full_relation, y='laptop_specs_source', x='laptop_specs_price', palette="viridis")
InĀ [38]:
# Descriptive statistics (count, mean, std, quartiles, ...) of price for every shop
price_stats_by_source = (
    full_relation
    .groupby('laptop_specs_source')['laptop_specs_price']
    .describe()
)

print(price_stats_by_source)
count mean std min \
laptop_specs_source
cellphones 264.0 2.940125e+07 2.012227e+07 9490000.0
fptshop 211.0 2.747815e+07 1.818169e+07 9490000.0
gearvn 148.0 2.554818e+07 1.225296e+07 11990000.0
hacom 482.0 2.435385e+07 1.207721e+07 8799000.0
laptopaz 198.0 2.660227e+07 1.243102e+07 11990000.0
laptopworld 77.0 3.024714e+07 1.227698e+07 16290000.0
nguyenkim 55.0 1.755909e+07 5.782377e+06 9790000.0
phongvu 296.0 2.511128e+07 1.133528e+07 9490000.0
thegioididong 228.0 1.986680e+07 6.136243e+06 7890000.0
25% 50% 75% max
laptop_specs_source
cellphones 17140000.0 23840000.0 34990000.0 182490000.0
fptshop 16490000.0 21990000.0 31440000.0 128990000.0
gearvn 18265000.0 22140000.0 25840000.0 89990000.0
hacom 16799000.0 21199000.0 29374000.0 95699000.0
laptopaz 17990000.0 23990000.0 30490000.0 85000000.0
laptopworld 21990000.0 27490000.0 34390000.0 88490000.0
nguyenkim 13640000.0 16790000.0 20990000.0 32990000.0
phongvu 17990000.0 21990000.0 27990000.0 83990000.0
thegioididong 16390000.0 18990000.0 22490000.0 70690000.0
Brand¶
Analysing number of laptops from each brand¶
InĀ [39]:
# Number of laptop listings per brand
brand_counts = full_relation['laptop_specs_brand'].value_counts()

# Horizontal bar chart of the counts
plt.figure(figsize=(12, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as y and suppress the (redundant) legend, as the FutureWarning recommends.
ax = sns.barplot(
    y=brand_counts.index,
    x=brand_counts.values,
    hue=brand_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Number of laptops per brand", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Brand", fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels; iterate over every container so all bars get a label
# regardless of how seaborn groups them
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

plt.show()
/tmp/ipykernel_670/2418909123.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(y=brand_counts.index, x=brand_counts.values, palette="viridis")
Analysing price grouped by brand¶
InĀ [40]:
# Box plot of laptop price for each brand
sns.set_theme(style="whitegrid")

plt.figure(figsize=(14, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as y and suppress the (redundant) legend, as the FutureWarning recommends.
sns.boxplot(
    data=full_relation,
    y='laptop_specs_brand',
    x='laptop_specs_price',
    hue='laptop_specs_brand',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Brand", fontsize=16)
plt.ylabel("Brand", fontsize=14)
plt.xlabel("Price", fontsize=14)
plt.grid(True)

plt.tight_layout()
plt.show()
/tmp/ipykernel_670/2449895171.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=full_relation, y='laptop_specs_brand', x='laptop_specs_price', palette="viridis")
Central Processing Unit (CPU)¶
Basic analysis¶
InĀ [41]:
# Mean price and number of listings for every CPU model
mean_price_by_cpu = full_relation.groupby('laptop_specs_cpu')['laptop_specs_price'].agg(['mean', 'count'])
print("Number of unique CPUs:", mean_price_by_cpu.shape[0], end='\n\n')

# Rank by mean price while the column is still numeric; after the formatting
# step below the column holds strings and must not be used for sorting.
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='mean', ascending=False)

# Format the mean price as currency with the "đ" suffix
# (the suffix was previously a mis-encoded character)
mean_price_by_cpu['mean'] = mean_price_by_cpu['mean'].map("{:,.2f}đ".format)

# Most and least expensive CPUs
print("Top 10 CPUs by Mean Price:")
print(mean_price_by_cpu.head(10), '\n\n')
print("Bottom 10 CPUs by Mean Price:")
print(mean_price_by_cpu.tail(10), '\n\n')

# Re-rank by how many listings use each CPU
mean_price_by_cpu = mean_price_by_cpu.sort_values(by='count', ascending=False)

# Most and least common CPUs
print("Top 10 CPUs by Count:")
print(mean_price_by_cpu.head(10), '\n\n')
print("Bottom 10 CPUs by Count:")
print(mean_price_by_cpu.tail(10), '\n\n')
Number of unique CPUs: 131
Top 10 CPUs by Mean Price:
mean count
laptop_specs_cpu
apple m3 max 16 core 138,740,000.00Ä 2
apple m2 max 12 core 105,990,000.00Ä 1
apple m4 max 16 core 102,490,000.00Ä 2
intel core i9 13980hx 90,240,000.00Ä 4
apple m4 max 14 core 86,656,666.67Ä 3
intel core i9 13950hx 85,699,000.00Ä 1
intel core i9 11900h 85,000,000.00Ä 1
apple m3 max 14 core 82,490,000.00Ä 5
intel core i7 13850hx 73,049,000.00Ä 2
intel core i9 10885h 72,990,000.00Ä 1
Bottom 10 CPUs by Mean Price:
mean count
laptop_specs_cpu
amd ryzen 5 5500u 12,994,500.00Ä 2
intel core 3 100u 12,990,000.00Ä 1
intel core i3 1220p 12,490,000.00Ä 1
amd ryzen 7 5700u 12,415,153.85Ä 13
amd ryzen 5 7520u 12,104,454.55Ä 22
intel core i3 1315u 11,889,387.10Ä 31
intel core i3 8145u 11,640,000.00Ä 2
intel core i3 1305u 11,531,272.73Ä 11
intel core i3 1215u 9,968,217.39Ä 23
intel celeron n4500 8,340,000.00Ä 2
Top 10 CPUs by Count:
mean count
laptop_specs_cpu
intel core i5 13420h 18,797,554.62Ä 119
intel core ultra 7 155h 34,566,491.53Ä 118
intel core i5 1335u 17,550,769.91Ä 113
intel core i7 13620h 23,585,819.82Ä 111
intel core i7 1355u 21,408,989.80Ä 98
intel core i5 1235u 15,204,220.78Ä 77
intel core ultra 5 125h 24,521,516.13Ä 62
intel core i5 1334u 16,736,633.33Ä 60
intel core i5 12450h 17,077,685.19Ä 54
apple m2 8 core 30,680,660.00Ä 50
Bottom 10 CPUs by Count:
mean count
laptop_specs_cpu
intel core i5 1230u 24,490,000.00Ä 1
intel core i7 1250u 24,999,000.00Ä 1
amd ryzen 7 4800h 17,890,000.00Ä 1
amd ryzen 7 6800h 17,690,000.00Ä 1
intel core i5 1345u 24,999,000.00Ä 1
intel core ultra 7 256v 26,690,000.00Ä 1
qualcomm snapdragon x elite - x1e-78-100 31,190,000.00Ä 1
intel core ultra 7 165u 31,490,000.00Ä 1
intel core i5 11320h 15,990,000.00Ä 1
amd ryzen 7 5800hs 23,990,000.00Ä 1
Analyzing CPU performance relation with price¶
InĀ [42]:
# Pearson correlation of each CPU rating with the laptop price
laptop_price = full_relation['laptop_specs_price']
correlation_multithread_price = full_relation['cpu_specs_multithread_rating'].corr(laptop_price)
correlation_single_thread_price = full_relation['cpu_specs_single_thread_rating'].corr(laptop_price)

# Report both coefficients, rounded to two decimals
print(f"Correlation between multithread_rating and price: {correlation_multithread_price:.2f}")
print(f"Correlation between single_thread_rating and price: {correlation_single_thread_price:.2f}")
Correlation between multithread_rating and price: 0.57 Correlation between single_thread_rating and price: 0.50
InĀ [43]:
# Side-by-side regression plots of CPU rating against laptop price
sns.set_theme(style="whitegrid")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Each panel: (rating column, readable label, point/line colour), left to right
cpu_rating_panels = (
    ('cpu_specs_single_thread_rating', 'Single Thread Rating', 'blue'),
    ('cpu_specs_multithread_rating', 'Multithread Rating', 'green'),
)

for panel_ax, (rating_column, rating_label, panel_colour) in zip(axes, cpu_rating_panels):
    # Scatter with a fitted regression line on the panel's own axes
    sns.regplot(
        data=full_relation,
        x=rating_column,
        y='laptop_specs_price',
        ax=panel_ax,
        color=panel_colour,
        scatter_kws={'s': 10},
    )
    panel_ax.set_title(f'{rating_label} vs Price')
    panel_ax.set_xlabel(rating_label)
    panel_ax.set_ylabel('Price')

plt.tight_layout()
plt.show()
Graphics Processing Unit (GPU)¶
Basic analysis¶
InĀ [44]:
# Mean price and number of listings for every GPU model
# (the GPU model is stored in the 'laptop_specs_vga' column)
mean_price_by_gpu = full_relation.groupby('laptop_specs_vga')['laptop_specs_price'].agg(['mean', 'count'])
print("Number of unique GPUs:", mean_price_by_gpu.shape[0], end='\n\n')

# Rank by mean price while the column is still numeric; after the formatting
# step below the column holds strings and must not be used for sorting.
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='mean', ascending=False)

# Format the mean price as currency with the "đ" suffix
# (the suffix was previously a mis-encoded character)
mean_price_by_gpu['mean'] = mean_price_by_gpu['mean'].map("{:,.2f}đ".format)

# Most and least expensive GPUs
print("Top 10 GPUs by Mean Price:")
print(mean_price_by_gpu.head(10), '\n\n')
print("Bottom 10 GPUs by Mean Price:")
print(mean_price_by_gpu.tail(10), '\n\n')

# Re-rank by how many listings use each GPU
mean_price_by_gpu = mean_price_by_gpu.sort_values(by='count', ascending=False)

# Most and least common GPUs
print("Top 10 GPUs by Count:")
print(mean_price_by_gpu.head(10), '\n\n')
print("Bottom 10 GPUs by Count:")
print(mean_price_by_gpu.tail(10), '\n\n')
Number of unique GPUs: 23
Top 10 GPUs by Mean Price:
mean count
laptop_specs_vga
geforce rtx 4090 93,490,000.00Ä 4
geforce rtx 4080 76,677,500.00Ä 8
rtx 2000 ada generation 75,153,571.43Ä 7
geforce rtx 2060 55,990,000.00Ä 1
rtx a1000 49,597,000.00Ä 4
geforce rtx 4070 47,671,538.46Ä 26
rtx a500 47,532,333.33Ä 3
geforce gtx 1650 ti 40,990,000.00Ä 2
geforce rtx 3070 ti 37,490,000.00Ä 1
geforce rtx 3060 35,521,900.00Ä 10
Bottom 10 GPUs by Mean Price:
mean count
laptop_specs_vga
geforce mx570 25,099,000.00Ä 2
radeon rx 7600s 23,490,000.00Ä 1
geforce mx450 22,994,500.00Ä 2
geforce rtx 3050 22,533,222.89Ä 166
geforce mx550 20,602,454.55Ä 11
geforce rtx 2050 18,259,426.23Ä 61
geforce mx250 18,190,000.00Ä 1
geforce mx350 17,990,000.00Ä 1
geforce gtx 1650 17,623,333.33Ä 3
radeon rx 6550m 15,540,000.00Ä 2
Top 10 GPUs by Count:
mean count
laptop_specs_vga
geforce rtx 3050 22,533,222.89Ä 166
geforce rtx 4050 27,156,802.63Ä 152
geforce rtx 4060 34,472,539.82Ä 113
geforce rtx 2050 18,259,426.23Ä 61
geforce rtx 4070 47,671,538.46Ä 26
geforce mx550 20,602,454.55Ä 11
geforce rtx 3050 ti 30,670,000.00Ä 10
geforce rtx 3060 35,521,900.00Ä 10
geforce rtx 4080 76,677,500.00Ä 8
rtx 2000 ada generation 75,153,571.43Ä 7
Bottom 10 GPUs by Count:
mean count
laptop_specs_vga
geforce gtx 1650 17,623,333.33Ä 3
geforce gtx 1650 ti 40,990,000.00Ä 2
geforce mx570 25,099,000.00Ä 2
geforce mx450 22,994,500.00Ä 2
radeon rx 6550m 15,540,000.00Ä 2
geforce rtx 3070 ti 37,490,000.00Ä 1
radeon rx 7600s 23,490,000.00Ä 1
geforce rtx 2060 55,990,000.00Ä 1
geforce mx250 18,190,000.00Ä 1
geforce mx350 17,990,000.00Ä 1
Analyzing GPU performance relation with price¶
InĀ [45]:
# Pearson correlation between the GPU score and the laptop price
gpu_score = full_relation['gpu_specs_avg_g3d_mark']
correlation_avg_g3d_mark_price = gpu_score.corr(full_relation['laptop_specs_price'])

# Report the coefficient, rounded to two decimals
print(f"Correlation between avg_g3d_mark and price: {correlation_avg_g3d_mark_price:.2f}")
Correlation between avg_g3d_mark and price: 0.59
InĀ [46]:
# Regression plot of GPU score against laptop price
sns.set_theme(style="whitegrid")

plt.figure(figsize=(10, 6))
sns.regplot(
    x='gpu_specs_avg_g3d_mark',
    y='laptop_specs_price',
    data=full_relation,
    color='blue',
    scatter_kws={'s': 10},
)

# Title, axis labels and grid
plt.title("Correlation between Avg G3D Mark and Price", fontsize=16)
plt.xlabel("Avg G3D Mark", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

plt.tight_layout()
plt.show()
Random Access Memory (RAM)¶
Basic analysis¶
InĀ [47]:
# Frequency tables for the two RAM columns (most common value first)
ram_amount_counts = full_relation['laptop_specs_ram_amount'].value_counts()
ram_type_counts = full_relation['laptop_specs_ram_type'].value_counts()

# RAM amount frequencies
print("Unique RAM amounts and their counts:")
print(ram_amount_counts)

# RAM type frequencies
print("\nUnique RAM types and their counts:")
print(ram_type_counts)
Unique RAM amounts and their counts: laptop_specs_ram_amount 16.0 1202 8.0 459 32.0 182 24.0 46 4.0 19 12.0 16 36.0 14 64.0 9 48.0 4 18.0 4 96.0 1 128.0 1 Name: count, dtype: int64 Unique RAM types and their counts: laptop_specs_ram_type ddr5 1043 ddr4 752 Name: count, dtype: int64
InĀ [48]:
# Convert RAM amount to categorical type.
# NOTE: this changes the shared full_relation column in place; code that needs
# numeric RAM amounts afterwards has to cast back (e.g. with .astype(float)).
full_relation['laptop_specs_ram_amount'] = pd.Categorical(full_relation['laptop_specs_ram_amount'])

# Vertical bar chart: RAM amount on the x axis, number of laptops on the y axis
ram_amount_labels = ram_amount_counts.index.astype(int).astype(str)
plt.figure(figsize=(12, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as x and suppress the (redundant) legend, as the FutureWarning recommends.
ax = sns.barplot(
    x=ram_amount_labels,
    y=ram_amount_counts.values,
    hue=ram_amount_labels,
    palette="viridis",
    legend=False,
)
plt.title("Number of Laptops by RAM Amount", fontsize=16)
# Axis labels match the plotted variables (they were previously swapped)
plt.xlabel("RAM Amount (GB)", fontsize=14)
plt.ylabel("Count", fontsize=14)
# Grid lines along the numeric (count) axis
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels; iterate over every container so all bars get a label
# regardless of how seaborn groups them
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

plt.show()
/tmp/ipykernel_670/3967087379.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(x=ram_amount_counts.index.astype(int).astype(str), y=ram_amount_counts.values, palette="viridis")
InĀ [49]:
# Donut-style pie chart of the RAM type shares
plt.figure(figsize=(8, 8))
ram_type_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff','#99ff99'],
    labels=ram_type_counts.index,
    wedgeprops=dict(width=0.3),  # ring width < 1 leaves the centre empty
)

plt.title("Distribution of RAM Types", fontsize=16)

plt.show()
Analyzing RAM performance relation with price¶
InĀ [50]:
# Pearson correlation between RAM amount and laptop price.
# The RAM column is cast to float first so it can be correlated numerically.
ram_amount_numeric = full_relation['laptop_specs_ram_amount'].astype(float)
correlation_ram_price = ram_amount_numeric.corr(full_relation['laptop_specs_price'])

# Report the coefficient, rounded to two decimals
print(f"Correlation between RAM amount and price: {correlation_ram_price:.2f}")
Correlation between RAM amount and price: 0.66
InĀ [51]:
# Box plot of laptop price for each RAM amount
sns.set_theme(style="whitegrid")

plt.figure(figsize=(14, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as x and suppress the (redundant) legend, as the FutureWarning recommends.
sns.boxplot(
    data=full_relation,
    x='laptop_specs_ram_amount',
    y='laptop_specs_price',
    hue='laptop_specs_ram_amount',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by RAM Amount", fontsize=16)
plt.xlabel("RAM Amount (GB)", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

plt.tight_layout()
plt.show()
/tmp/ipykernel_670/1837874617.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=full_relation, x='laptop_specs_ram_amount', y='laptop_specs_price', palette="viridis")
InĀ [52]:
# Filled KDE curves of laptop price, one curve per RAM type
sns.set_theme(style="whitegrid")

plt.figure(figsize=(14, 8))
sns.kdeplot(
    x='laptop_specs_price',
    hue='laptop_specs_ram_type',
    data=full_relation,
    palette="viridis",
    fill=True,
)

# Title, axis labels and grid
plt.title("Price Distribution by RAM Type", fontsize=16)
plt.xlabel("Price", fontsize=14)
plt.ylabel("Density", fontsize=14)
plt.grid(True)

plt.tight_layout()
plt.show()
Storage¶
Basic analysis¶
InĀ [53]:
# Frequency tables for storage amount and storage type.
# Operate on a private copy of full_relation.
full_relation_clone = full_relation.copy()

# Coerce the storage amount to numbers; anything unparsable becomes NaN
storage_amount_numeric = pd.to_numeric(full_relation_clone['laptop_specs_storage_amount'], errors='coerce')
full_relation_clone['laptop_specs_storage_amount'] = storage_amount_numeric

# Keep only rows whose storage amount is at least 128 (NaN rows fail the comparison and are dropped)
has_min_storage = full_relation_clone['laptop_specs_storage_amount'] >= 128
full_relation_clone = full_relation_clone[has_min_storage]

# Storage amount frequencies
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()
print("Unique storage amounts and their counts:")
print(storage_amount_counts)

# Storage type frequencies
storage_type_counts = full_relation_clone['laptop_specs_storage_type'].value_counts()
print("\nUnique storage types and their counts:")
print(storage_type_counts)
Unique storage amounts and their counts: laptop_specs_storage_amount 512.0 1227 1024.0 338 256.0 121 2048.0 14 8192.0 1 Name: count, dtype: int64 Unique storage types and their counts: laptop_specs_storage_type ssd 1636 hdd 6 Name: count, dtype: int64
InĀ [54]:
# Convert storage amount to categorical type (on the filtered clone only)
full_relation_clone['laptop_specs_storage_amount'] = pd.Categorical(full_relation_clone['laptop_specs_storage_amount'])

# Recount on the categorical column
storage_amount_counts = full_relation_clone['laptop_specs_storage_amount'].value_counts()

# Vertical bar chart: storage amount on the x axis, number of laptops on the y axis
storage_amount_labels = storage_amount_counts.index.astype(int).astype(str)
plt.figure(figsize=(12, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as x and suppress the (redundant) legend, as the FutureWarning recommends.
ax = sns.barplot(
    x=storage_amount_labels,
    y=storage_amount_counts.values,
    hue=storage_amount_labels,
    palette="viridis",
    legend=False,
)
plt.title("Number of Laptops by Storage Amount", fontsize=16)
plt.xlabel("Storage Amount (GB)", fontsize=14)
plt.ylabel("Count", fontsize=14)
# Grid lines along the numeric (count) axis rather than the categorical one
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels; iterate over every container so all bars get a label
# regardless of how seaborn groups them
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

plt.show()
/tmp/ipykernel_670/4095148162.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(x=storage_amount_counts.index.astype(int).astype(str), y=storage_amount_counts.values, palette="viridis")
Analyzing Storage relation with price¶
InĀ [55]:
# Pearson correlation between storage amount and laptop price.
# The storage column is cast to float first so it can be correlated numerically.
storage_amount_float = full_relation_clone['laptop_specs_storage_amount'].astype(float)
correlation_storage_price = storage_amount_float.corr(full_relation_clone['laptop_specs_price'])

# Report the coefficient, rounded to two decimals
print(f"Correlation between storage amount and price: {correlation_storage_price:.2f}")
Correlation between storage amount and price: 0.56
InĀ [56]:
# Box plot of laptop price for each storage amount
sns.set_theme(style="whitegrid")

plt.figure(figsize=(14, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as x and suppress the (redundant) legend, as the FutureWarning recommends.
sns.boxplot(
    data=full_relation_clone,
    x='laptop_specs_storage_amount',
    y='laptop_specs_price',
    hue='laptop_specs_storage_amount',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Storage Amount", fontsize=16)
plt.xlabel("Storage Amount (GB)", fontsize=14)
plt.ylabel("Price", fontsize=14)
plt.grid(True)

plt.tight_layout()
plt.show()
/tmp/ipykernel_670/4036590408.py:6: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=full_relation_clone, x='laptop_specs_storage_amount', y='laptop_specs_price', palette="viridis")
Screen Features¶
Basic analysis¶
InĀ [57]:
# Summary statistics for the three numeric screen columns,
# each computed and printed in turn.
screen_size_stats = full_relation['laptop_specs_screen_size'].describe()
print("Summary Statistics for Screen Size:")
print(screen_size_stats)

refresh_rate_stats = full_relation['laptop_specs_screen_refresh_rate'].describe()
print("\nSummary Statistics for Screen Refresh Rate:")
print(refresh_rate_stats)

brightness_stats = full_relation['laptop_specs_screen_brightness'].describe()
print("\nSummary Statistics for Screen Brightness:")
print(brightness_stats)
Summary Statistics for Screen Size: count 1707.000000 mean 14.971822 std 0.959482 min 13.000000 25% 14.000000 50% 15.600000 75% 15.600000 max 18.000000 Name: laptop_specs_screen_size, dtype: float64 Summary Statistics for Screen Refresh Rate: count 1266.000000 mean 109.504739 std 47.309397 min 60.000000 25% 60.000000 50% 120.000000 75% 144.000000 max 480.000000 Name: laptop_specs_screen_refresh_rate, dtype: float64 Summary Statistics for Screen Brightness: count 1148.000000 mean 333.719512 std 103.811275 min 220.000000 25% 250.000000 50% 300.000000 75% 400.000000 max 1200.000000 Name: laptop_specs_screen_brightness, dtype: float64
InĀ [58]:
# Number of laptop listings per screen resolution (most common first)
screen_resolution_counts = full_relation['laptop_specs_screen_resolution'].value_counts()
print("Unique screen resolutions and their counts:")
print(screen_resolution_counts)

# Horizontal bar chart of the counts
plt.figure(figsize=(12, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as y and suppress the (redundant) legend, as the FutureWarning recommends.
ax = sns.barplot(
    y=screen_resolution_counts.index,
    x=screen_resolution_counts.values,
    hue=screen_resolution_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Number of Laptops by Screen Resolution", fontsize=16)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Screen Resolution", fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

# Add data labels; iterate over every container so all bars get a label
# regardless of how seaborn groups them
for container in ax.containers:
    ax.bar_label(container, fmt='%d')

plt.show()
Unique screen resolutions and their counts: laptop_specs_screen_resolution 1920x1080 978 1920x1200 406 2880x1800 156 2560x1600 144 3024x1964 31 2880x1864 29 2560x1664 21 2880x1920 16 2880x1620 13 3840x2400 12 2560x1644 12 3200x2000 12 3456x2234 11 2560x1440 10 2240x1400 7 3072x1920 6 2048x1280 6 1366x768 5 3456x2160 2 2960x1848 1 2220x1080 1 3201x2000 1 2256x1504 1 3000x2000 1 3840x2160 1 2160x1440 1 Name: count, dtype: int64
/tmp/ipykernel_670/3570925166.py:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(y=screen_resolution_counts.index, x=screen_resolution_counts.values, palette="viridis")
Analysis of screen features with price¶
InĀ [59]:
# Box plot of laptop price per screen resolution, limited to common resolutions.
# Work on a copy so the shared full_relation frame is not modified.
full_relation_clone = full_relation.copy()

# Get the counts of each screen resolution
screen_resolution_counts = full_relation_clone['laptop_specs_screen_resolution'].value_counts()

# Keep only resolutions with at least 20 listings
filtered_screen_resolutions = screen_resolution_counts[screen_resolution_counts >= 20].index
full_relation_clone = full_relation_clone[
    full_relation_clone['laptop_specs_screen_resolution'].isin(filtered_screen_resolutions)
]

sns.set_theme(style="whitegrid")

# Sort by the resolution value so the boxes appear in that order
full_relation_clone = full_relation_clone.sort_values(by='laptop_specs_screen_resolution')

plt.figure(figsize=(14, 8))
# seaborn deprecates `palette` without `hue`; colour by the same variable
# as y and suppress the (redundant) legend, as the FutureWarning recommends.
sns.boxplot(
    data=full_relation_clone,
    y='laptop_specs_screen_resolution',
    x='laptop_specs_price',
    hue='laptop_specs_screen_resolution',
    palette="viridis",
    legend=False,
)

# Add titles and labels
plt.title("Price Distribution by Screen Resolution", fontsize=16)
plt.ylabel("Screen Resolution", fontsize=14)
plt.xlabel("Price", fontsize=14)
plt.grid(True)

plt.tight_layout()
plt.show()
/tmp/ipykernel_670/1749291397.py:21: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=full_relation_clone, y='laptop_specs_screen_resolution', x='laptop_specs_price', palette="viridis")
InĀ [60]:
# Correlation of each numeric screen feature with price, followed by regression plots
screen_price = full_relation['laptop_specs_price']
correlation_screen_size_price = full_relation['laptop_specs_screen_size'].corr(screen_price)
correlation_refresh_rate_price = full_relation['laptop_specs_screen_refresh_rate'].corr(screen_price)
correlation_brightness_price = full_relation['laptop_specs_screen_brightness'].corr(screen_price)

print(f"Correlation between screen size and price: {correlation_screen_size_price:.2f}")
print(f"Correlation between screen refresh rate and price: {correlation_refresh_rate_price:.2f}")
print(f"Correlation between screen brightness and price: {correlation_brightness_price:.2f}")

sns.set_theme(style="whitegrid")

# One panel per screen feature, left to right
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Each panel: (feature column, readable name, unit shown on the x axis, colour)
screen_feature_panels = (
    ('laptop_specs_screen_size', 'Screen Size', 'inches', 'blue'),
    ('laptop_specs_screen_refresh_rate', 'Screen Refresh Rate', 'Hz', 'green'),
    ('laptop_specs_screen_brightness', 'Screen Brightness', 'nits', 'red'),
)

for panel_ax, (feature_column, feature_name, feature_unit, panel_colour) in zip(axes, screen_feature_panels):
    # Scatter (semi-transparent points) with a fitted regression line
    sns.regplot(
        data=full_relation,
        x=feature_column,
        y='laptop_specs_price',
        ax=panel_ax,
        color=panel_colour,
        scatter_kws={'alpha':0.7},
    )
    panel_ax.set_title(f'Price vs. {feature_name}')
    panel_ax.set_xlabel(f'{feature_name} ({feature_unit})')
    panel_ax.set_ylabel('Price (VND)')

plt.tight_layout()
plt.show()
Correlation between screen size and price: 0.06 Correlation between screen refresh rate and price: 0.29 Correlation between screen brightness and price: 0.50
Portability Features¶
Weight¶
Basic analysis
InĀ [61]:
# Descriptive statistics (count, mean, std, quartiles, extremes) of laptop weight
weight_stats = full_relation['laptop_specs_weight'].describe()
# Header and table go out in one call, separated by a newline
print("Summary Statistics for Weight:", weight_stats, sep="\n")
Summary Statistics for Weight: count 1577.000000 mean 1.728397 std 0.415082 min 0.879000 25% 1.400000 50% 1.650000 75% 2.000000 max 4.000000 Name: laptop_specs_weight, dtype: float64
InĀ [62]:
# Histogram (30 bins) with a KDE overlay of laptop weights; missing weights are dropped
plt.figure(figsize=(10, 6))
weight_ax = sns.histplot(
    full_relation['laptop_specs_weight'].dropna(),
    kde=True,
    color='blue',
    bins=30,
)
# Title and axis labels are set on the axes returned by seaborn
weight_ax.set_title("Distribution of Laptop Weights", fontsize=16)
weight_ax.set_xlabel("Weight (kg)", fontsize=14)
weight_ax.set_ylabel("Frequency", fontsize=14)
plt.show()
Analysis of weight with price
InĀ [63]:
# Pearson correlation between laptop weight and price
weight_series = full_relation['laptop_specs_weight']
weight_price_series = full_relation['laptop_specs_price']
correlation_weight_price = weight_series.corr(weight_price_series)
# Report the coefficient rounded to two decimals
print(f"Correlation between weight and price: {correlation_weight_price:.2f}")
Correlation between weight and price: 0.17
InĀ [64]:
# Scatter of price against weight with a fitted regression line
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 6))
weight_price_ax = sns.regplot(
    data=full_relation,
    x='laptop_specs_weight',
    y='laptop_specs_price',
    color='blue',
    scatter_kws={'s': 10},  # small markers
)
# Decorate the axes that seaborn drew on
weight_price_ax.set_title("Weight vs Price", fontsize=16)
weight_price_ax.set_xlabel("Weight (kg)", fontsize=14)
weight_price_ax.set_ylabel("Price (VND)", fontsize=14)
weight_price_ax.grid(True)
plt.tight_layout()
plt.show()
Length, Width, Height¶
Basic analysis
InĀ [65]:
# Summary statistics for the three physical dimensions of the chassis.
# NOTE: the printed labels now match the columns they describe. Previously
# `laptop_specs_height` was reported as "Length" and `laptop_specs_depth` as
# "Height", which misrepresented both columns. The variable names below are
# kept unchanged so any other cell that refers to them keeps working:
#   length_stats -> laptop_specs_height (presumably chassis thickness - confirm)
#   width_stats  -> laptop_specs_width
#   height_stats -> laptop_specs_depth
length_stats = full_relation['laptop_specs_height'].describe()
width_stats = full_relation['laptop_specs_width'].describe()
height_stats = full_relation['laptop_specs_depth'].describe()
# Print the results under labels that name the actual column
print("Summary Statistics for Height (thickness):")
print(length_stats)
print("\nSummary Statistics for Width:")
print(width_stats)
print("\nSummary Statistics for Depth:")
print(height_stats)
Summary Statistics for Length: count 1498.000000 mean 1.907049 std 0.647121 min 0.930000 25% 1.690000 50% 1.830000 75% 1.990000 max 22.700000 Name: laptop_specs_height, dtype: float64 Summary Statistics for Width: count 1498.000000 mean 34.120287 std 2.341405 min 28.700000 25% 31.560000 50% 35.610000 75% 35.940000 max 50.500000 Name: laptop_specs_width, dtype: float64 Summary Statistics for Height: count 1498.000000 mean 23.461589 std 1.766152 min 3.000000 25% 22.100000 50% 23.500000 75% 24.770000 max 31.600000 Name: laptop_specs_depth, dtype: float64
InĀ [66]:
# Distribution (30-bin histogram + KDE) of each physical dimension, one panel each.
# Titles and axis labels name the column that is actually plotted: the panels
# for `laptop_specs_height` and `laptop_specs_depth` were previously titled
# "Length" and "Height" respectively, which mislabelled both.
# Each entry: (column, colour, dimension label)
dimension_hist_panels = (
    ('laptop_specs_height', 'blue', 'Height (Thickness)'),
    ('laptop_specs_width', 'green', 'Width'),
    ('laptop_specs_depth', 'red', 'Depth'),
)
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for dim_ax, (dim_col, dim_color, dim_label) in zip(axes, dimension_hist_panels):
    # Missing values are dropped before binning
    sns.histplot(full_relation[dim_col].dropna(), kde=True, color=dim_color, bins=30, ax=dim_ax)
    dim_ax.set_title(f"Distribution of Laptop {dim_label}", fontsize=16)
    dim_ax.set_xlabel(f"{dim_label} (cm)", fontsize=14)
    dim_ax.set_ylabel("Frequency", fontsize=14)
plt.tight_layout()
plt.show()
Analysis of dimensions with price
InĀ [67]:
# Pearson correlation of each physical dimension with price.
# NOTE: the printed labels now name the column actually used. Previously the
# `laptop_specs_height` result was printed as "length" and the
# `laptop_specs_depth` result as "height". Variable names are kept unchanged
# so any other cell that refers to them keeps working:
#   correlation_length_price -> laptop_specs_height
#   correlation_width_price  -> laptop_specs_width
#   correlation_height_price -> laptop_specs_depth
correlation_length_price = full_relation['laptop_specs_height'].corr(full_relation['laptop_specs_price'])
correlation_width_price = full_relation['laptop_specs_width'].corr(full_relation['laptop_specs_price'])
correlation_height_price = full_relation['laptop_specs_depth'].corr(full_relation['laptop_specs_price'])
# Print the correlation results
print(f"Correlation between height (thickness) and price: {correlation_length_price:.2f}")
print(f"Correlation between width and price: {correlation_width_price:.2f}")
print(f"Correlation between depth and price: {correlation_height_price:.2f}")
Correlation between length and price: -0.00 Correlation between width and price: -0.11 Correlation between height and price: 0.13
InĀ [68]:
# Price against each physical dimension, with a fitted regression line per panel.
# Titles and axis labels name the column that is actually plotted: the panels
# for `laptop_specs_height` and `laptop_specs_depth` were previously titled
# "Length" and "Height" respectively, which mislabelled both.
# Each entry: (column, colour, dimension label)
dimension_price_panels = (
    ('laptop_specs_height', 'blue', 'Height (Thickness)'),
    ('laptop_specs_width', 'green', 'Width'),
    ('laptop_specs_depth', 'red', 'Depth'),
)
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for dim_price_ax, (dim_price_col, dim_price_color, dim_price_label) in zip(axes, dimension_price_panels):
    sns.regplot(
        data=full_relation,
        x=dim_price_col,
        y='laptop_specs_price',
        ax=dim_price_ax,
        color=dim_price_color,
        scatter_kws={'s': 10},  # small markers
    )
    dim_price_ax.set_title(f'{dim_price_label} vs Price')
    dim_price_ax.set_xlabel(f'{dim_price_label} (cm)')
    dim_price_ax.set_ylabel('Price (VND)')
plt.tight_layout()
plt.show()
InĀ [69]:
# Volume (height x width x depth) against price.
# Start from a fresh copy of the full dataset. `full_relation_clone` was last
# assigned in the screen-resolution cell, where it was filtered down to the
# common resolutions and re-sorted; reusing it here would silently compute the
# volume correlation on that subset and tie this cell to the earlier one.
full_relation_clone = full_relation.copy()
# Volume is the product of the three dimension columns; rows missing any
# dimension get NaN and are skipped by `corr` below.
full_relation_clone['volume'] = (
    full_relation_clone['laptop_specs_height']
    * full_relation_clone['laptop_specs_width']
    * full_relation_clone['laptop_specs_depth']
)
# Calculate the correlation between volume and price
correlation_volume_price = full_relation_clone['volume'].corr(full_relation_clone['laptop_specs_price'])
# Print the correlation result
print(f"Correlation between volume and price: {correlation_volume_price:.2f}")
# Scatter of price against volume with a fitted regression line
plt.figure(figsize=(10, 6))
sns.regplot(data=full_relation_clone, x='volume', y='laptop_specs_price', color='blue', scatter_kws={'s': 10})
# Add titles and labels
plt.title("Volume vs Price", fontsize=16)
plt.xlabel("Volume (cm³)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)
# Show the plot
plt.tight_layout()
plt.show()
Correlation between volume and price: 0.03
Battery and Power¶
Basic Analysis¶
InĀ [70]:
# Descriptive statistics for the two battery columns
battery_amount_stats = full_relation['laptop_specs_battery_capacity'].describe()
battery_cells_stats = full_relation['laptop_specs_battery_cells'].describe()
# Each header is followed by its table; a blank line separates the two reports
print("Summary Statistics for Battery Capacity:", battery_amount_stats, sep="\n")
print("\nSummary Statistics for Battery Cells:", battery_cells_stats, sep="\n")
Summary Statistics for Battery Capacity: count 1707.000000 mean 58.177510 std 23.062029 min 36.000000 25% 47.000000 50% 55.000000 75% 65.000000 max 800.000000 Name: laptop_specs_battery_capacity, dtype: float64 Summary Statistics for Battery Cells: count 1236.000000 mean 3.442557 std 0.653190 min 2.000000 25% 3.000000 50% 3.000000 75% 4.000000 max 6.000000 Name: laptop_specs_battery_cells, dtype: float64
InĀ [71]:
# Histogram (30 bins) with a KDE overlay of battery capacity; missing values are dropped
plt.figure(figsize=(10, 6))
battery_ax = sns.histplot(
    full_relation['laptop_specs_battery_capacity'].dropna(),
    kde=True,
    color='blue',
    bins=30,
)
# Title and axis labels are set on the axes returned by seaborn
battery_ax.set_title("Distribution of Laptop Battery Capacity", fontsize=16)
battery_ax.set_xlabel("Battery Capacity (Wh)", fontsize=14)
battery_ax.set_ylabel("Frequency", fontsize=14)
plt.show()
Analysis of battery and power features with price
InĀ [72]:
# Pearson correlation of battery capacity and battery cell count with price
battery_price_series = full_relation['laptop_specs_price']
correlation_battery_capacity_price = full_relation['laptop_specs_battery_capacity'].corr(battery_price_series)
correlation_battery_cells_price = full_relation['laptop_specs_battery_cells'].corr(battery_price_series)
# Report both coefficients rounded to two decimals
for battery_feature_name, battery_feature_corr in (
    ("battery capacity", correlation_battery_capacity_price),
    ("battery cells", correlation_battery_cells_price),
):
    print(f"Correlation between {battery_feature_name} and price: {battery_feature_corr:.2f}")
Correlation between battery capacity and price: 0.44 Correlation between battery cells and price: 0.59
InĀ [73]:
# Scatter of price against battery capacity with a fitted regression line
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 6))
battery_price_ax = sns.regplot(
    data=full_relation,
    x='laptop_specs_battery_capacity',
    y='laptop_specs_price',
    color='blue',
    scatter_kws={'s': 10},  # small markers
)
# Decorate the axes that seaborn drew on
battery_price_ax.set_title("Battery Capacity vs Price", fontsize=16)
battery_price_ax.set_xlabel("Battery Capacity (Wh)", fontsize=14)
battery_price_ax.set_ylabel("Price (VND)", fontsize=14)
battery_price_ax.grid(True)
plt.tight_layout()
plt.show()
Connectivity Features¶
Basic analysis¶
InĀ [74]:
# Frequency table of each connectivity column (how many laptops have N ports)
usb_a_counts = full_relation['laptop_specs_number_usb_a_ports'].value_counts()
usb_c_counts = full_relation['laptop_specs_number_usb_c_ports'].value_counts()
hdmi_counts = full_relation['laptop_specs_number_hdmi_ports'].value_counts()
ethernet_counts = full_relation['laptop_specs_number_ethernet_ports'].value_counts()
audio_jack_counts = full_relation['laptop_specs_number_audio_jacks'].value_counts()

# Print every table under its own header; all but the first header are
# preceded by a blank line.
port_count_reports = (
    ("USB-A ports", usb_a_counts),
    ("USB-C ports", usb_c_counts),
    ("HDMI ports", hdmi_counts),
    ("Ethernet ports", ethernet_counts),
    ("audio jacks", audio_jack_counts),
)
for report_position, (port_kind, port_kind_counts) in enumerate(port_count_reports):
    header_prefix = "\n" if report_position else ""
    print(f"{header_prefix}Unique values and counts for number of {port_kind}:")
    print(port_kind_counts)
Unique values and counts for number of USB-A ports: laptop_specs_number_usb_a_ports 0.0 850 2.0 441 3.0 241 1.0 171 4.0 41 6.0 4 5.0 3 12.0 3 8.0 2 Name: count, dtype: int64 Unique values and counts for number of USB-C ports: laptop_specs_number_usb_c_ports 1.0 831 2.0 411 0.0 404 4.0 50 3.0 32 8.0 22 5.0 6 Name: count, dtype: int64 Unique values and counts for number of HDMI ports: laptop_specs_number_hdmi_ports 1.0 1517 0.0 239 Name: count, dtype: int64 Unique values and counts for number of Ethernet ports: laptop_specs_number_ethernet_ports 0.0 1470 1.0 286 Name: count, dtype: int64 Unique values and counts for number of audio jacks: laptop_specs_number_audio_jacks 0.0 1039 1.0 717 Name: count, dtype: int64
InĀ [75]:
# Pie chart of the port-count distribution for every connectivity feature.
# `plt` and `sns` are already imported in the notebook's first cell, so the
# redundant mid-notebook re-imports were dropped, and the five copy-pasted
# pie blocks are driven by a single loop.
# Each entry: (value counts computed in the previous cell, feature label)
port_pie_specs = (
    (usb_a_counts, "USB-A Ports"),
    (usb_c_counts, "USB-C Ports"),
    (hdmi_counts, "HDMI Ports"),
    (ethernet_counts, "Ethernet Ports"),
    (audio_jack_counts, "Audio Jacks"),
)
# 2x3 grid flattened to a 1-D array so panels can be addressed by position
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()
for pie_ax, (pie_counts, pie_label) in zip(axes, port_pie_specs):
    pie_counts.plot(
        kind='pie',
        autopct='%1.1f%%',
        startangle=140,
        ax=pie_ax,
        colors=sns.color_palette('pastel', len(pie_counts)),
        labels=None,  # no wedge labels; the legend carries the categories
    )
    pie_ax.set_title(f"Distribution of {pie_label}")
    pie_ax.set_ylabel('')
    pie_ax.legend(pie_counts.index, title=pie_label, loc="best")
# Five features on a six-panel grid: remove the unused last panel
fig.delaxes(axes[5])
# Adjust layout
plt.tight_layout()
plt.show()
Analysis of connectivity features with price¶
InĀ [76]:
# Pearson correlation of each port-count column with price
connectivity_price_series = full_relation['laptop_specs_price']
correlation_usb_a_price = full_relation['laptop_specs_number_usb_a_ports'].corr(connectivity_price_series)
correlation_usb_c_price = full_relation['laptop_specs_number_usb_c_ports'].corr(connectivity_price_series)
correlation_hdmi_price = full_relation['laptop_specs_number_hdmi_ports'].corr(connectivity_price_series)
correlation_ethernet_price = full_relation['laptop_specs_number_ethernet_ports'].corr(connectivity_price_series)
correlation_audio_jack_price = full_relation['laptop_specs_number_audio_jacks'].corr(connectivity_price_series)
# Report every coefficient rounded to two decimals
for connectivity_name, connectivity_corr in (
    ("USB-A ports", correlation_usb_a_price),
    ("USB-C ports", correlation_usb_c_price),
    ("HDMI ports", correlation_hdmi_price),
    ("Ethernet ports", correlation_ethernet_price),
    ("audio jacks", correlation_audio_jack_price),
):
    print(f"Correlation between number of {connectivity_name} and price: {connectivity_corr:.2f}")
Correlation between number of USB-A ports and price: -0.14 Correlation between number of USB-C ports and price: 0.00 Correlation between number of HDMI ports and price: -0.18 Correlation between number of Ethernet ports and price: -0.04 Correlation between number of audio jacks and price: 0.03
Software Features¶
Default OS¶
Basic analysis
InĀ [77]:
# Normalise the default-OS column, then count laptops per OS.
def _normalize_default_os(os_name):
    """Return 'windows' for any string containing 'window' (case-insensitive).

    Every other value - including None, NaN and non-string entries - is
    returned unchanged. The `isinstance` guard matters: calling `.lower()`
    on a NaN float would raise AttributeError.
    """
    if isinstance(os_name, str) and 'window' in os_name.lower():
        return 'windows'
    return os_name

# Collapse all Windows spellings/editions into the single label 'windows'.
# This overwrites the column in `full_relation`; re-running the cell is
# harmless because already-normalised values map to themselves.
full_relation['laptop_specs_default_os'] = full_relation['laptop_specs_default_os'].apply(_normalize_default_os)
# Count after normalisation (a count taken before it was never used)
os_counts = full_relation['laptop_specs_default_os'].value_counts()
print("Unique OS and their counts:")
print(os_counts)
Unique OS and their counts: laptop_specs_default_os windows 1703 macos 140 linux 30 chrome os 2 Name: count, dtype: int64
InĀ [78]:
# Donut chart of the default-OS share
plt.figure(figsize=(8, 8))
os_pie_ax = os_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#99ff99', '#ffcc99', '#ff9999'],
    labels=None,  # no wedge labels; the legend carries the categories
    wedgeprops=dict(width=0.3),  # ring width < 1 turns the pie into a donut
    textprops={'fontsize': 10},  # size of the percentage text
)
# Legend entries follow the order of the counts
os_pie_ax.legend(os_counts.index, loc="best")
os_pie_ax.set_title("Distribution of Default OS", fontsize=16)
plt.show()
Warranty¶
InĀ [79]:
# Frequency table of warranty values
warranty_counts = full_relation['laptop_specs_warranty'].value_counts()
# Header and table go out in one call, separated by a newline
print("Unique warranty values and their counts:", warranty_counts, sep="\n")
Unique warranty values and their counts: laptop_specs_warranty 12.0 851 24.0 765 36.0 76 18.0 1 Name: count, dtype: int64
InĀ [80]:
# Relationship between warranty length and price: correlation plus boxplot.
# Pearson correlation between the warranty column and price
correlation_warranty_price = full_relation['laptop_specs_warranty'].corr(full_relation['laptop_specs_price'])
print(f"Correlation between warranty and price: {correlation_warranty_price:.2f}")
# Set the plot style
sns.set_theme(style="whitegrid")
# One box of prices per distinct warranty value
plt.figure(figsize=(14, 8))
# Passing `palette` without `hue` is deprecated (seaborn FutureWarning, removal
# announced for v0.14.0). Mapping the x variable to `hue` and switching the
# legend off yields the same colouring without the warning.
sns.boxplot(
    data=full_relation,
    x='laptop_specs_warranty',
    y='laptop_specs_price',
    hue='laptop_specs_warranty',
    palette="viridis",
    legend=False,
)
# Add titles and labels
plt.title("Price Distribution by Warranty", fontsize=16)
plt.xlabel("Warranty (months)", fontsize=14)
plt.ylabel("Price (VND)", fontsize=14)
plt.grid(True)
# Show the plot
plt.tight_layout()
plt.show()
Correlation between warranty and price: 0.05
/tmp/ipykernel_670/2459959301.py:10: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=full_relation, x='laptop_specs_warranty', y='laptop_specs_price', palette="viridis")
Target Feature: price¶
Basic statistics
InĀ [81]:
# Descriptive statistics (count, mean, std, quartiles, extremes) of the target column
price_stats = full_relation['laptop_specs_price'].describe()
# Header and table go out in one call, separated by a newline
print("Basic Statistics for Price:", price_stats, sep="\n")
Basic Statistics for Price: count 1.959000e+03 mean 2.532114e+07 std 1.386554e+07 min 7.890000e+06 25% 1.699000e+07 50% 2.189000e+07 75% 2.939000e+07 max 1.824900e+08 Name: laptop_specs_price, dtype: float64
Visualizing the distribution
InĀ [82]:
# Histogram (30 bins) with a KDE overlay of laptop prices
plt.figure(figsize=(12, 6))
price_hist_ax = sns.histplot(
    full_relation['laptop_specs_price'],
    kde=True,
    color='blue',
    bins=30,
)
# Title and axis labels are set on the axes returned by seaborn
price_hist_ax.set_title("Distribution of Laptop Prices", fontsize=16)
price_hist_ax.set_xlabel("Price (VND)", fontsize=14)
price_hist_ax.set_ylabel("Frequency", fontsize=14)
plt.show()
InĀ [83]:
# Single horizontal boxplot of laptop prices (median, quartiles, outliers).
plt.figure(figsize=(12, 6))
# There is no categorical variable here to map to `hue`, so `palette` is
# meaningless for this plot and only triggers seaborn's "palette without hue"
# FutureWarning. A single explicit colour is used instead (a teal taken from
# the middle of the viridis colormap, in keeping with the other boxplots).
sns.boxplot(data=full_relation, x='laptop_specs_price', color="#21918c")
# Add titles and labels
plt.title("Boxplot of Laptop Prices", fontsize=16)
plt.xlabel("Price (VND)", fontsize=14)
plt.grid(True)
# Show the plot
plt.tight_layout()
plt.show()
/tmp/ipykernel_670/2849463995.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(data=full_relation, x='laptop_specs_price', palette="viridis")